# REPLICATION CODE FOR THE MAIN ANALYSIS ON PEER SOCIALISATION AMONG ELITE BUSINESS STUDENTS
# Note: Replicators must install dplyr, ggplot2, psych, cobalt, huxtable, lmtest, tidyr, ggeffects
# Author of script: Hilma Lindskog with contributions from Sofiya Voytiv
# Contact: Hilma.lindskog@gu.se

# Set working directory 
setwd()

# Load dataset 
dataset <- read_excel("?????")
regression_sorted <- arrange(dataset, ID)


###########################
# Data prep
###########################
# Identiy cohort and school 
regression_sorted <- select(regression_sorted) %>%
  mutate(ENR = case_when(
    ENR == "This is my first academic year" ~ 1,
    ENR == "This is my second academic year" ~ 2,
    ENR == "This is my third academic year" ~ 3,
    TRUE ~ NA
  ))


#Identify cohort with ENR and SCHOOL (first, second, third)

library(dplyr)
regression_sorted <- regression_sorted %>%
  group_by(ID) %>%
  mutate(
    COHORTM = case_when(ENR == 1 & Source == "w1" ~ "Cohort 1",
                       ENR == 2 & Source == "w2" ~ "Cohort 1",
                       ENR == 3 & Source == "w3" ~ "Cohort 1",
                       ENR == 4 & Source == "w4" ~ "Cohort 1",
                       ENR == 1 & Source == "w2" ~ "Cohort 2",
                       ENR == 2 & Source == "w3" ~ "Cohort 2",
                       ENR == 3 & Source == "w4" ~ "Cohort 2",
                       ENR == 1 & Source == "w3" ~ "Cohort 3",
                       ENR == 2 & Source == "w4" ~ "Cohort 3",
                       TRUE ~ NA_character_))

# Recode the SCHOOL variable
regression_sorted$SCHOOL <- recode(regression_sorted$SCHOOL,
                                   "Cinek/Indek KTH" = "KTH",
                                   "KTH" = "KTH",
                                   "GU" = "GU",
                                   "School of Business, Economics and Law at the University of Gothenburg" = "GU",
                                   "Hanken" = "Hanken",
                                   "Hanken School of Economics" = "Hanken",
                                   "HHS" = "HHS",
                                   "Stockholm School of Economics" = "HHS")

# Convert the SCHOOL variable
regression_sorted$SCHOOL <- factor(regression_sorted$SCHOOL, levels = c("KTH", "GU", "Hanken", "HHS"))
table(regression_sorted$SCHOOL)

#School + cohort
regression_sorted <- regression_sorted %>%
  group_by(ID) %>%
  mutate(
    SCOHORT = case_when(COHORTM == "Cohort 1" & SCHOOL == "KTH" ~ "KTH1",
                        COHORTM == "Cohort 1" & SCHOOL == "Hanken" ~ "Hanken1",
                        COHORTM == "Cohort 1" & SCHOOL == "GU" ~ "GU1",
                        COHORTM == "Cohort 1" & SCHOOL == "HHS" ~ "HHS1",
                        COHORTM == "Cohort 2" & SCHOOL == "KTH" ~ "KTH2",
                        COHORTM == "Cohort 2" & SCHOOL == "Hanken" ~ "Hanken2",
                        COHORTM == "Cohort 2" & SCHOOL == "GU" ~ "GU2",
                        COHORTM == "Cohort 2" & SCHOOL == "HHS" ~ "HHS2",
                        COHORTM == "Cohort 3" & SCHOOL == "KTH" ~ "KTH3",
                        COHORTM == "Cohort 3" & SCHOOL == "Hanken" ~ "Hanken3",
                        COHORTM == "Cohort 3" & SCHOOL == "GU" ~ "GU3",
                        COHORTM == "Cohort 3" & SCHOOL == "HHS" ~ "HHS3",
                        TRUE ~ NA_character_))
table(regression_sorted$SCOHORT)


## Define and clean outcome variables: Absolute deviation between individuals attitude and the average attitude in the school

library(dplyr)

# Generate new dependent variables - REFUGEES
SCOHORT_averagesR <- aggregate(POL_REFU ~ SCOHORT, regression_sorted, FUN = mean)
regression_sorted <- merge(regression_sorted, SCOHORT_averagesR, by = "SCOHORT", suffixes = c("", "_SCOHORT"))
regression_sorted$deviationrefu <- abs(regression_sorted$POL_REFU - regression_sorted$POL_REFU_SCOHORT)
table(regression_sorted$deviationrefu)

# WOMEN 
SCOHORT_averagesW <- aggregate(POL_WOMEN ~ SCOHORT, regression_sorted, FUN = mean)
regression_sorted <- merge(regression_sorted, SCOHORT_averagesW, by = "SCOHORT", suffixes = c("", "_SCOHORT"))
regression_sorted$deviationwomen <- abs(regression_sorted$POL_WOMEN - regression_sorted$POL_WOMEN_SCOHORT)
table(regression_sorted$deviationwomen)

# TAXES
SCOHORT_averagesT <- aggregate(POL_TAXES ~ SCOHORT, regression_sorted, FUN = mean)
regression_sorted <- merge(regression_sorted, SCOHORT_averagesT, by = "SCOHORT", suffixes = c("", "_SCOHORT"))
regression_sorted$deviationtaxes <- abs(regression_sorted$POL_TAXES - regression_sorted$POL_TAXES_SCOHORT)
table(regression_sorted$deviationtaxes)

# DIVIDENTS
SCOHORT_averagesD <- aggregate(POL_DIVIDENT ~ SCOHORT, regression_sorted, FUN = mean)
regression_sorted <- merge(regression_sorted, SCOHORT_averagesD, by = "SCOHORT", suffixes = c("", "_SCOHORT"))
regression_sorted$deviationdiv <- abs(regression_sorted$POL_DIVIDENT - regression_sorted$POL_DIVIDENT_SCOHORT)
table(regression_sorted$deviationdiv)


############################################
## Define and clean independent variables ##
############################################

# Remove missing values 
regression_sorted <- regression_sorted[!is.na(regression_sorted$POL_REFU), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$POL_WOMEN), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$POL_TAXES), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$POL_DIVIDENT), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$SOC_ANO), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$SOC_FRI), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$SOC_HIGH), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$GENDER), ]
regression_sorted <- regression_sorted[!is.na(regression_sorted$BIRTHY), ]

#RECODE GENDER
regression_sorted$GENDER[regression_sorted$GENDER ==1 | regression_sorted$ASGENDER == 1]<-0
regression_sorted$GENDER[regression_sorted$GENDER ==2 | regression_sorted$ASGENDER == 2]<-1
regression_sorted$GENDER[regression_sorted$GENDER == 3]<-NA
regression_sorted$GENDER[regression_sorted$GENDER == 4]<-NA
table(regression_sorted$GENDER)

# Create a new variable called "Age" based on the respondent's birth year
regression_sorted$age <- 2023 - regression_sorted$BIRTHY
regression_sorted$age[regression_sorted$age == 1]<-NA
table(regression_sorted$age)


# GENERATE VARIABLES FOR SOCIAL BEHAVIOR
# Convert FMEET* (frequency of social contacts) to numeric
regression_sorted$FMEET1 <- as.numeric(regression_sorted$FMEET1)
regression_sorted$FMEET2 <- as.numeric(regression_sorted$FMEET2)
regression_sorted$FMEET3 <- as.numeric(regression_sorted$FMEET3)
regression_sorted$FMEET4 <- as.numeric(regression_sorted$FMEET4)
regression_sorted$FMEET5 <- as.numeric(regression_sorted$FMEET5)

#SOCIAL INTERACTIONS
regression_sorted$friendfreq <- regression_sorted$FMEET1 + regression_sorted$FMEET2 + regression_sorted$FMEET3 + regression_sorted$FMEET5 
table(regression_sorted$friendfreq)

# Reverse the continuous variable 
regression_sorted$friendfreq_reversed <- max(regression_sorted$friendfreq) + 1 - regression_sorted$friendfreq
table(regression_sorted$friendfreq_reversed)

# Define the old and new range
old_min <- 2
old_max <- 21
new_min <- 1
new_max <- 10

# Rescale the variable- contunious
regression_sorted$friendfreq_rescaled <- ((regression_sorted$friendfreq_reversed - old_min) * (new_max - new_min)) / (old_max - old_min) + new_min
table(regression_sorted$friendfreq_rescaled)

#Group into three categories
regression_sorted$friendfreq_group <- cut(regression_sorted$friendfreq_reversed, breaks = c(2, 9, 16, 22), include.lowest = TRUE, labels = c("Low engagement", "Medium", "High engagement"))
table(regression_sorted$friendfreq_group)


###########################################################################
######## DISTRIBUTION OF ATTITUDES ACROSS YEARS SPENT IN SCHOOL ###########
###########################################################################

### DISTRIBUTION WITH MEAN PLOTS
# plyr, gridextra and scales necessary libraries

means <- ddply(regression_sorted, "ENR", summarise, ref.mean = mean(POL_REFU))
means.w <- ddply(regression_sorted, "ENR", summarise, wom.mean = mean(POL_WOMEN))
means.tax <- ddply(regression_sorted, "ENR", summarise, tax.mean = mean(POL_TAXES))
means.div <- ddply(regression_sorted, "ENR", summarise, div.mean = mean(POL_DIVIDENT))


# REFUGEES
refu <- ggplot(regression_sorted, aes(POL_REFU, group = ENR, fill = factor(ENR))) +
  geom_bar(aes(y = after_stat(prop)), stat = "count", width = 0.7) +
  scale_y_continuous(labels = scales::percent) +
  ylab(" ") +
  xlab(" ") +
  facet_grid(~ENR) +
  scale_fill_manual(values = cbp3) +
  theme_minimal() +
  labs(
    title = "Refugees"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_blank() # Remove the legend
  )

refu

# WOMEN
women <- ggplot(regression_sorted, aes(POL_WOMEN, group = ENR, fill = factor(ENR))) +
  geom_bar(aes(y = after_stat(prop)), stat = "count", width = 0.7) +
  scale_y_continuous(labels = scales::percent) +
  ylab(" ") +
  xlab(" ") +
  facet_grid(~ENR) +
  scale_fill_manual(values = cbp3) +
  theme_minimal() +
  labs(
    title = "Women Quotas"
  ) +
  theme(
    legend.position = "none",
    ,
    axis.text.x = element_blank(),
    axis.text.y = element_blank()# Remove the legend
  )

women

# TAXES 
taxes <- ggplot(regression_sorted, aes(POL_TAXES, group = ENR, fill = factor(ENR))) +
  geom_bar(aes(y = after_stat(prop)), stat = "count", width = 0.7) +
  scale_y_continuous(labels = scales::percent) +
  ylab(" ") +
  xlab(" ") +
  facet_grid(~ENR) +
  scale_fill_manual(values = cbp3) +
  theme_minimal() +
  labs(
    title = "Taxes"
  ) +
  theme(
    legend.position = "none",# Remove the legend
  )

taxess

# DIVIDENTS
dividents <- ggplot(regression_sorted, aes(POL_DIVIDENT, group = ENR, fill = factor(ENR))) +
  geom_bar(aes(y = after_stat(prop)), stat = "count", width = 0.7) +
  scale_y_continuous(labels = scales::percent) +
  ylab(" ") +
  xlab(" ") +
  facet_grid(~ENR) +
  scale_fill_manual(values = cbp3) +
  theme_minimal() +
  labs(
    title = "Dividends"
  ) +
  theme(
    legend.position = "none",
    axis.text.y = element_blank()# Remove the legend # Remove the legend
  )

dividents

distribution <- grid.arrange(refu, women, taxes, dividents, common.legend = TRUE, legend = "right")
distribution
ggsave("dist.jpeg", distribution, width = 10, height = 8, dpi = 300)

#################################################
## MEAN TABLES AND STANDARD DEVIATIONS         ##
#################################################
# Group the data by ENR and calculate mean, standard deviation, and N for POL_DIVIDENT
enr_summary <- regression_sorted %>%
  group_by(ENR) %>%
  summarise(mean_POL_DIVIDENT = mean(POL_DIVIDENT, na.rm = TRUE),
            sd_POL_DIVIDENT = sd(POL_DIVIDENT, na.rm = TRUE),
            N = n())

# Group the data by ENR and calculate mean, standard deviation, and N for POL_DIVIDENT
enr_summary <- regression_sorted %>%
  group_by(ENR) %>%
  summarise(mean_POL_REFU = mean(POL_REFU, na.rm = TRUE),
            sd_POL_REFU = sd(POL_REFU, na.rm = TRUE),
            N = n())


# Group the data by ENR and calculate mean, standard deviation, and N for POL_DIVIDENT
enr_summary <- regression_sorted %>%
  group_by(ENR) %>%
  summarise(mean_POL_WOMEN = mean(POL_WOMEN, na.rm = TRUE),
            sd_POL_WOMEN = sd(POL_WOMEN, na.rm = TRUE),
            N = n())

# Group the data by ENR and calculate mean, standard deviation, and N for POL_DIVIDENT
enr_summary <- regression_sorted %>%
  group_by(ENR) %>%
  summarise(mean_POL_TAXES = mean(POL_TAXES, na.rm = TRUE),
            sd_POL_TAXES = sd(POL_TAXES, na.rm = TRUE),
            N = n())


##############################
#### REGRESSION ANALYSIS #####
##############################
### Load stargazer for this analysis
# For absolute attitudinal effect, change the dependent to POL_REFU, POL_WOMEN, POL_TAXES and POL_DIVIDENTS and not the distance to the mean variables

##OLS REGRESSION (TEST 2, deviation from mean as dependent) - HERE WE NEED TO KNOW THE SCHOOL DIFFERENCES
linear <- lm(deviationrefu ~ factor(GENDER) + factor(friendfreq_group) + SOC_ANO + SOC_FRI + SOC_HIGH + age + factor(SCHOOL) + factor(ENR), data = regression_sorted)
linear1_2 <- lm(deviationwomen ~ factor(GENDER) + factor(friendfreq_group) + SOC_ANO + SOC_FRI + SOC_HIGH + age + factor(SCHOOL) + factor(ENR), data = regression_sorted)
linear1_3 <- lm(deviationtaxes ~ factor(GENDER) + factor(friendfreq_group) + SOC_ANO + SOC_FRI + SOC_HIGH + age + factor(SCHOOL) + factor(ENR), data = regression_sorted)
linear1_4 <- lm(deviationdiv ~ factor(GENDER) + factor(friendfreq_group) + SOC_ANO + SOC_FRI + SOC_HIGH + age + factor(SCHOOL) + factor(ENR), data = regression_sorted)

summary(linear)
summary(linear1_2)
summary(linear1_3)
summary(linear1_4)

# Run stargazer and save the output to a variable
table <- stargazer(linear, linear1_2, linear1_3, linear1_4,
                   type = "text",
                   title = "Regression Results",
                   align = TRUE,
                   dep.var.caption = "Attitudes",
                   dep.var.labels.include = FALSE,
                   se = list(summary(linear)$coefficients[, "Std. Error"],
                             summary(linear1_2)$coefficients[, "Std. Error"],
                             summary(linear1_3)$coefficients[, "Std. Error"],
                             summary(linear1_4)$coefficients[, "Std. Error"]),
                   p = list(summary(linear)$coefficients[, "Pr(>|t|)"],
                            summary(linear1_2)$coefficients[, "Pr(>|t|)"],
                            summary(linear1_3)$coefficients[, "Pr(>|t|)"],
                            summary(linear1_4)$coefficients[, "Pr(>|t|)"]),
                   star.cutoffs = c(0.05, 0.01, 0.001))

# Write the table to a text file
write.table(table, file = "regression_results.txt", row.names = FALSE, sep = ',', quote = FALSE)



######################################################
######## DIFFERENCE IN DIFFERENCE ANALYSIS ###########
#######################################################

###### DATA PREPARATIONS
#Generate time and panel- variables
# Time
df <- regression_sorted %>% 
  group_by(ID) %>% 
  mutate(Time = dplyr::row_number())
table(df$Time)
is.numeric(df$Time)

#Generate panel variable
# Group data by ID
mastera_grouped <- df %>% 
  group_by(ID)

# Generate new variable based on count of IDs
mastera_new <- mastera_grouped %>% 
  mutate(panel = ifelse(n() == 1, 0, 1))
table(mastera_new$panel)

# Filter the dataframe to include IDs included in the panel
mastera_new_panel <- subset(mastera_new, panel == 1)

# Filter the dataframe to include data only from the first and second time points
mastera_new_panel <- subset(mastera_new_panel, Time %in% c(1, 2) & ENR %in% c(1, 2))
table(mastera_new_panel$Time)


#Generate a treatment variable 
mastera_new_panel <- mastera_new_panel %>%
  group_by(ID) %>%
  filter(n() > 1) %>%
  mutate(covid = case_when(
    "w1" %in% Source & "w2" %in% Source ~ 0,
    "w2" %in% Source & "w3" %in% Source ~ 1,
    TRUE ~ NA_real_
  )) %>%
  group_by(ID) %>%
  fill(covid, .direction = "down") %>%
  ungroup()
table(mastera_new_updated$covid)

#CHECK COMPOSITION
mastera_new_updated <- mastera_new_updated[complete.cases(mastera_new_updated$covid), ]

covs <- subset(mastera_new_updated, select = c (covid, SCHOOL, GENDER, age, RFAEDU))
balance_table <- bal.tab(covid ~ covs, data = mastera_new_updated, 
                         disp = c("means", "sds"), un = TRUE, 
                         stats = c("mean.diffs", "variance.ratios"),
                         s.d.denom = "0")
print(balance_table)

#Recode time
mastera_new_updated$Time <- ifelse(mastera_new_updated$Time == 1, 0, 1)
mastera_new_updated$Time


########################################################################################################################
#Diff-in-diff ANALYSIS: ABSOLUTE DEVIATIATION (For absolute attitudinal change, exhange the dependent variables to their linear form POL_REFU, POL_WOMEN, POL_TAXES, POL_DIVIDENTS)
########################################################################################################################

library(lmtest)
mod1 <- lm(deviationrefu ~ covid*Time + factor(SCHOOL), data = mastera_new_updated) 
mod4 <- lm(deviationwomen ~ covid*Time + factor(SCHOOL), data = mastera_new_updated) 
mod5 <- lm(deviationtaxes ~ covid*Time + factor(SCHOOL), data = mastera_new_updated) 
mod6 <- lm(deviationdiv ~ covid*Time + factor(SCHOOL), data = mastera_new_updated) 
huxreg(mod1, mod4, mod5, mod6, number_format = 2)


#ILLUSTRATION DIFF IN DIFF

#Color scheme chosen for all plots (ok for colorblind people)

cbp1 <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
                   "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
cbp2<-c( "#D55E00","#0072B2")
cbp3<-c( "#D55E00","#0072B2","#999999")


#GRAPHICS for dif in dif

#Refugees
dat <- ggpredict(mod1, terms = c("Time", "covid"))
dat$x <- factor(dat$x)
levels(dat$x) <- c("T1","T2")
dat$Cohort <- factor(dat$group, labels = c("Campus", "Online"))

p1 <- ggplot(dat, aes(x=x, y=predicted, group=Cohort, col=Cohort)) +
  geom_point(position=position_dodge(0.05)) +
  geom_line(aes(linetype=Cohort)) +
  geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=.2, position=position_dodge(0.05)) +
  scale_y_continuous(expand = c(0, 0.1), limits = c(0, 2), breaks = seq(0, 2, by = 0.5)) +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12)) +
  labs(x = "", y = "") +
  theme(panel.background = element_rect(fill = "white"),
        legend.key = element_rect(fill = "white"),
        axis.line = element_line(linewidth = 0.2, colour = "black"),
        axis.line.y = element_line(linewidth = 0.2, colour = "black"),
        axis.text.x = element_blank(),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 12, family = "Times New Roman"),
        panel.grid.major = element_blank(),  # Remove major grid lines
        panel.grid.minor = element_blank()) +
  guides(colour = guide_legend(title = "Cohort", 
                               labels = c("Campus", "Online"))) +
  ggtitle("Refugees")
p1 + theme(plot.title = element_text(hjust = 0.5))

#GRAPHICS Women
dat4 <- ggpredict(mod4, terms = c("Time", "covid"))
dat4$x <- factor(dat4$x)
levels(dat4$x) <- c("T1","T2")
dat4$Cohort <- factor(dat4$group, labels = c("Campus", "Online"))

p2 <- ggplot(dat4, aes(x=x, y=predicted, group=Cohort, col=Cohort)) +
  geom_point(position=position_dodge(0.05)) +
  geom_line(aes(linetype=Cohort)) +
  geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=.2, position=position_dodge(0.05)) +
  scale_y_continuous(expand = c(0, 0.1), limits = c(0, 2), breaks = seq(0, 2, by = 0.5)) +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12)) +
  labs(x = "", y = "") +
  theme(panel.background = element_rect(fill = "white"),
        legend.key = element_rect(fill = "white"),
        axis.line = element_line(size = 0.2, colour = "black"),
        axis.line.y = element_line(size = 0.2, colour = "black"),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 12, family = "Times New Roman"),
        panel.grid.major = element_blank(),  # Remove major grid lines
        panel.grid.minor = element_blank()) +
  guides(colour = guide_legend(title = "Cohort", 
                               labels = c("Campus", "Online"))) +
  ggtitle("Women quotas")
p2 + theme(plot.title = element_text(hjust = 0.5))


#GRAPHICS Taxes
dat5 <- ggpredict(mod5, terms = c("Time", "covid"))
dat5$x <- factor(dat5$x)
levels(dat5$x) <- c("T1","T2")
dat5$Cohort <- factor(dat5$group, labels = c("Campus", "Online"))

p3 <- ggplot(dat5, aes(x=x, y=predicted, group=Cohort, col=Cohort)) +
  geom_point(position=position_dodge(0.05)) +
  geom_line(aes(linetype=Cohort)) +
  geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=.2, position=position_dodge(0.05)) +
  scale_y_continuous(expand = c(0, 0.1), limits = c(0, 2), breaks = seq(0, 2, by = 0.5)) +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12)) +
  labs(x = "", y = "") +
  theme(panel.background = element_rect(fill = "white"),
        legend.key = element_rect(fill = "white"),
        axis.line = element_line(size = 0.2, colour = "black"),
        axis.line.y = element_line(size = 0.2, colour = "black"),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 12, family = "Times New Roman"),
        panel.grid.major = element_blank(),  # Remove major grid lines
        panel.grid.minor = element_blank()) +
  guides(colour = guide_legend(title = "Cohort", 
                               labels = c("Campus", "Online"))) +
  ggtitle("Taxes")
p3 + theme(plot.title = element_text(hjust = 0.5))

#GRAPHICS Dividends
dat6 <- ggpredict(mod6, terms = c("Time", "covid"))
dat6$x <- factor(dat6$x)
levels(dat6$x) <- c("T1","T2")
dat6$Cohort <- factor(dat6$group, labels = c("Campus", "Online"))


p4 <- ggplot(dat6, aes(x=x, y=predicted, group=Cohort, col=Cohort)) +
  geom_point(position=position_dodge(0.05)) + 
  geom_line(aes(linetype=Cohort)) +
  geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=.3, position=position_dodge(0.05)) +
  scale_y_continuous(expand = c(0, 0.1), limits = c(0, 2), breaks = seq(0, 2, by = 0.5)) +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12)) +
  labs(x = "", y = "") +
  theme(panel.background = element_rect(fill = "white"),
        legend.key = element_rect(fill = "white"),
        axis.line = element_line(linewidth = 0.3, colour = "black"),
        axis.line.y = element_line(linewidth = 0.3, colour = "black"),
        axis.text.y = element_blank(),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 12, family = "Times New Roman"),
        panel.grid.major = element_blank(),  # Remove major grid lines
        panel.grid.minor = element_blank()) +
  guides(colour = guide_legend(title = "Cohort", 
                               labels = c("Campus", "Online"))) +
  ggtitle("Welfare dividends")
p4 + theme(plot.title = element_text(hjust = 0.5))


### ARRANGE THEM
p1<-p1 + scale_color_manual(values = cbp2)+theme_light()
p2<-p2 + scale_color_manual(values = cbp2)+theme_light()
p3<-p3 + scale_color_manual(values = cbp2)+theme_light()
p4<-p4 + scale_color_manual(values = cbp2)+theme_light()

g <- ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2, common.legend = TRUE, legend = "right")

# Set background and title properties for each individual plot
p1 <- p1 +
  theme(
    plot.background = element_rect(fill = "white"),
    plot.title = element_text(hjust = 0.5, size = 16)
  )

p2 <- p2 +
  theme(
    plot.background = element_rect(fill = "white"),
    plot.title = element_text(hjust = 0.5, size = 16)
  )

p3 <- p3 +
  theme(
    plot.background = element_rect(fill = "white"),
    plot.title = element_text(hjust = 0.5, size = 16)
  )

p4 <- p4 +
  theme(
    plot.background = element_rect(fill = "white"),
    plot.title = element_text(hjust = 0.5, size = 16)
  )

# Arrange the plots
g <- ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2, common.legend = TRUE, legend = "right")

# Set the background to white and center the titles for the entire arrangement
g + theme(
  plot.background = element_rect(fill = "white"),  # Set the background to white
  plot.title = element_text(hjust = 0.5, size = 16)  # Center the titles horizontally
)

ggsave("diff_in_diff.jpeg", g, width = 10, height = 8, dpi = 300)


